import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.offline as py
# Initialise plotly's offline notebook mode before any figure is shown.
# NOTE(review): per the plotly documentation, connected=False embeds the
# plotly.js bundle in the notebook instead of loading it from a CDN --
# confirm that this is the intended trade-off.
py.init_notebook_mode(connected=False)
# Raw-GitHub location of the global time-series CSVs in the
# CSSEGISandData/COVID-19 repository; the three files share this prefix.
_CSSE_TIME_SERIES_URL = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
)
CONFIRMED_PATH = _CSSE_TIME_SERIES_URL + 'time_series_covid19_confirmed_global.csv'
DEATH_PATH = _CSSE_TIME_SERIES_URL + 'time_series_covid19_deaths_global.csv'
RECOVERED_PATH = _CSSE_TIME_SERIES_URL + 'time_series_covid19_recovered_global.csv'
def get_data_df(data_path):
    """Load a time-series CSV and return one row of summed counts per country.

    Args:
        data_path: Anything ``pd.read_csv`` accepts (path, URL or buffer).
            The file must have 'Province/State' and 'Country/Region'
            columns; every column from the fifth onwards is summed.

    Returns:
        DataFrame with a 'country' column followed by the summed columns.
        Rows whose province or country contains 'Diamond Princess' are
        excluded before aggregation.
    """
    raw = pd.read_csv(data_path).rename(
        columns={'Province/State': 'province_or_state', 'Country/Region': 'country'}
    )
    # Missing provinces become '' so the string match below never sees NaN.
    raw['province_or_state'] = raw['province_or_state'].fillna('')

    for column in ('province_or_state', 'country'):
        is_ship = raw[column].str.contains('Diamond Princess')
        raw = raw.drop(index=raw.loc[is_ship].index)

    # The first four columns are province, country and two further
    # non-aggregated columns; the rest are summed per country.
    value_columns = raw.columns[4:]
    per_country = raw.groupby(['country']).agg(dict.fromkeys(value_columns, 'sum'))
    return per_country.reset_index()
def get_weather_df():
    """Read 'yearly_temp.csv' (tab-separated) and tidy it up.

    Returns:
        DataFrame whose column names and 'country' values have surrounding
        whitespace stripped and whose 'temp' column is numeric.
    """
    temps = pd.read_csv('yearly_temp.csv', sep='\t')
    temps = temps.rename(columns=str.strip)
    return temps.assign(
        country=temps['country'].str.strip(),
        temp=pd.to_numeric(temps['temp']),
    )
def get_population_df():
    """Read 'population_data.csv' (tab-separated) with two columns cleaned.

    'Med. Age' and 'Urban Pop %' are read as text; commas are removed,
    'N.A' is replaced by '0', and the result is converted to float.

    Returns:
        The population DataFrame with those two columns as floats.
    """
    population = pd.read_csv('population_data.csv', sep='\t')
    for column in ('Med. Age', 'Urban Pop %'):
        without_commas = population[column].str.replace(',', '')
        with_zeros = without_commas.str.replace('N.A', '0')
        population[column] = with_zeros.astype(float)
    return population
def join_data_df_weather(df):
    """Attach the weather columns to *df* by country and flag hot countries.

    Args:
        df: DataFrame with a 'country' column.

    Returns:
        A new DataFrame with the weather columns joined in and a boolean
        'hot' column (True where temp > 15.0). Rows whose country has no
        entry in the weather table are removed.
    """
    weather = get_weather_df()
    # Countries present in df but unknown to the weather table.
    unmatched = set(df['country'].unique()).difference(weather['country'].unique())

    joined = df.join(weather.set_index('country'), on='country', how='left')
    joined['hot'] = joined['temp'] > 15.0

    unmatched_rows = joined.loc[joined['country'].isin(unmatched)].index
    return joined.drop(index=unmatched_rows)
def join_data_df_population(df):
    """Attach the population columns to *df* by country.

    Args:
        df: DataFrame with a 'country' column.

    Returns:
        A new DataFrame with the population columns joined in. Rows whose
        country has no entry in the population table are removed.
    """
    population = get_population_df()
    # Countries present in df but unknown to the population table.
    unmatched = set(df['country'].unique()).difference(population['country'].unique())

    joined = df.join(population.set_index('country'), on='country', how='left')

    unmatched_rows = joined.loc[joined['country'].isin(unmatched)].index
    return joined.drop(index=unmatched_rows)
# --- Cumulative confirmed cases for a hand-picked set of countries ----------
COUNTRIES_OF_INTEREST = [
    'Italy',
    'France',
    'China',
    'United Kingdom',
    'US',
    'Germany',
    'Spain',
    'Japan',
    'Israel',
    'Netherlands',
    'Korea, South',
]

data_df = get_data_df(CONFIRMED_PATH)
# Column layout before any join: 'country' followed by one column per date.
data_df_columns = data_df.columns

# Names of every column the weather/population joins will add later
# (each table's first column is skipped).
weather_population_columns = [
    *get_weather_df().columns[1:],
    'hot',
    *get_population_df().columns[1:],
]

# Long format: one row per (country, date).
data_df_t = data_df.melt(
    id_vars=['country'],
    var_name='date',
    value_name='confirmed',
).fillna('<all>')
data_of_interest = data_df_t.loc[data_df_t['country'].isin(COUNTRIES_OF_INTEREST)]

fig = px.line(data_of_interest, x="date", y="confirmed", color='country', log_y=True)
fig.show()
# --- Day-over-day relative growth of confirmed cases, summed per 7 days -----
# growth[d] = (count[d] - count[d-1]) / count[d-1], computed for all date
# columns at once. The first date has no predecessor, and a previous count
# of zero gives NaN or +/-inf; all of those are reported as 0.0 so they
# cannot leak raw counts or infinities into the weekly sums.
growth_date_columns = list(data_df_columns[1:])
daily_counts = data_df[growth_date_columns].astype(float)
daily_growth = daily_counts.diff(axis=1) / daily_counts.shift(1, axis=1)
daily_growth = daily_growth.replace([np.inf, -np.inf], np.nan).fillna(0.0)

diff_df = data_df.copy()
# Whole-column replacement, so the integer count columns become float
# without an in-place upcast.
diff_df[growth_date_columns] = daily_growth

diff_df_t = diff_df.melt(id_vars=['country'], var_name='date', value_name='new')
diff_of_interest = diff_df_t[diff_df_t['country'].isin(COUNTRIES_OF_INTEREST)]

# c numbers each country's rows 0, 1, 2, ... in date order; c // 7 is
# therefore the index of the 7-day window a row belongs to.
c = diff_of_interest.groupby('country').cumcount()
week_of_interest = (
    diff_of_interest.groupby(['country', c // 7])
    .agg({'date': 'first', 'new': 'sum'})
    .reset_index()
)

fig = px.line(week_of_interest, x="date", y="new", color='country', log_y=True)
fig.show()
# --- Confirmed cases per inhabitant -----------------------------------------
data_df = join_data_df_weather(data_df)
data_df = join_data_df_population(data_df)

# Only the date columns are normalised. They are selected by name so that
# none of the joined weather/population columns is divided by accident.
confirmed_date_columns = list(data_df_columns[1:])
confirmed_population = pd.to_numeric(data_df['Population (2020)'])

data_norm_df = data_df.copy()
data_norm_df[confirmed_date_columns] = (
    data_df[confirmed_date_columns]
    .apply(pd.to_numeric)
    .div(confirmed_population, axis=0)
)

data_norm_df_t = data_norm_df.drop(columns=weather_population_columns)
data_norm_df_t = data_norm_df_t.melt(id_vars=['country'], var_name='date', value_name='confirmed')
norm_of_interest = data_norm_df_t[data_norm_df_t['country'].isin(COUNTRIES_OF_INTEREST)]

fig = px.line(norm_of_interest, x="date", y="confirmed", color='country', log_y=True)
fig.show()
# --- Deaths per inhabitant --------------------------------------------------
death_df = get_data_df(DEATH_PATH)
# get_data_df returns 'country' followed by the date columns; remember the
# date names before the joins append weather/population columns.
death_date_columns = list(death_df.columns[1:])
death_df = join_data_df_weather(death_df)
death_df = join_data_df_population(death_df)

# Only the date columns are normalised, selected by name so that none of
# the joined weather/population columns is divided by accident.
death_population = pd.to_numeric(death_df['Population (2020)'])
death_norm_df = death_df.copy()
death_norm_df[death_date_columns] = (
    death_df[death_date_columns]
    .apply(pd.to_numeric)
    .div(death_population, axis=0)
)

death_norm_df_t = death_norm_df.drop(columns=weather_population_columns)
death_norm_df_t = death_norm_df_t.melt(id_vars=['country'], var_name='date', value_name='deaths')
death_norm_of_interest = death_norm_df_t[death_norm_df_t['country'].isin(COUNTRIES_OF_INTEREST)]

fig = px.line(death_norm_of_interest, x="date", y="deaths", color='country', log_y=True)
fig.show()
# --- Active cases = confirmed - deaths - recovered --------------------------
recovered_df = get_data_df(RECOVERED_PATH)
recovered_df = join_data_df_weather(recovered_df)
recovered_df = join_data_df_population(recovered_df)


def _counts_by_country(frame, date_columns):
    """Return the numeric per-date counts of *frame*, indexed by country.

    Date columns that *frame* does not have come back as all-NaN columns.
    """
    by_country = frame.set_index('country').reindex(columns=date_columns)
    return by_country.apply(pd.to_numeric)


# The three tables are matched by country name and date label rather than
# by row/column position, so a country or date missing from one source
# yields NaN instead of being subtracted from the wrong cell. Only the date
# columns are touched; weather/population columns stay as they are.
active_date_columns = list(data_df_columns[1:])
confirmed_counts = _counts_by_country(data_df, active_date_columns)
death_counts = _counts_by_country(death_df, active_date_columns).reindex(confirmed_counts.index)
recovered_counts = _counts_by_country(recovered_df, active_date_columns).reindex(confirmed_counts.index)
active_counts = confirmed_counts - death_counts - recovered_counts

active_df = data_df.copy()
# confirmed_counts keeps data_df's row order, so the values line up with
# active_df positionally.
active_df[active_date_columns] = active_counts.to_numpy()

# NaN (missing source data) is left as NaN so the y column stays numeric.
active_df_norm_t = active_df.drop(columns=weather_population_columns)
active_df_norm_t = active_df_norm_t.melt(id_vars=['country'], var_name='date', value_name='active')
active_norm_of_interest = active_df_norm_t[active_df_norm_t['country'].isin(COUNTRIES_OF_INTEREST)]

fig = px.line(active_norm_of_interest, x="date", y="active", color='country', log_y=True)
fig.show()

# Append a synthetic 'world' row holding the per-date sum over all countries.
active_df.loc["world", data_df_columns[1:]] = active_df[data_df_columns[1:]].sum()
active_df.at['world', 'country'] = 'world'

active_df_norm_t = active_df.drop(columns=weather_population_columns)
active_df_norm_t = active_df_norm_t.melt(id_vars=['country'], var_name='date', value_name='active')
active_norm_of_interest = active_df_norm_t[active_df_norm_t['country'].isin(['world'])]

fig = px.line(active_norm_of_interest, x="date", y="active", color='country', log_y=True)
fig.show()
# --- Confirmed cases, hot vs. cold countries --------------------------------
# Sum the per-country counts within each 'hot' group first, so that each
# group contributes exactly one value per date and px.line draws one clean
# line per group.
hot_cold_date_columns = list(data_df_columns[1:])
hot_cold_df = (
    data_df[hot_cold_date_columns + ['hot']]
    .groupby('hot')
    .sum()
    .reset_index()
)
hot_cold_df_t = hot_cold_df.melt(id_vars=['hot'], var_name='date', value_name='confirmed')

fig = px.line(hot_cold_df_t, x="date", y="confirmed", color='hot')
fig.show()
# --- Latest confirmed counts against static country attributes --------------
# Keep only the most recent date column plus the weather/population columns.
static_columns = ['country', data_df_columns[-1]] + weather_population_columns


def _top_confirmed(frame, n=10, min_population=100000):
    """Return the *n* rows of *frame* with the highest 'confirmed' value.

    Rows whose 'Population (2020)' is below *min_population* are excluded.
    The population is converted to numeric before the comparison, so text
    values are compared as numbers.
    """
    population = pd.to_numeric(frame['Population (2020)'])
    large_enough = ~(population < min_population)
    return frame[large_enough].sort_values(by=['confirmed'], ascending=False).head(n=n)


data_df_scatter = data_df[static_columns]
data_df_scatter = data_df_scatter.rename(columns={data_df_columns[-1]: 'confirmed'})
# Bare expression: displays the table when it is the last expression of a
# notebook cell; it has no effect when the file runs as a plain script.
_top_confirmed(data_df_scatter)

data_norm_df = data_norm_df[static_columns]
data_norm_df = data_norm_df.rename(columns={data_df_columns[-1]: 'confirmed'})
# Same top-10 table for the per-inhabitant figures (see note above).
_top_confirmed(data_norm_df)

fig = px.scatter(data_norm_df, x="Urban Pop %", y="confirmed", hover_data=['country'], log_x=True, log_y=True)
fig.show()
fig = px.scatter(data_norm_df, x="Density (P/Km²)", y="confirmed", hover_data=['country'], log_x=True, log_y=True)
fig.show()
fig = px.scatter(data_norm_df, x="Med. Age", y="confirmed", hover_data=['country'], log_x=True, log_y=True)
fig.show()
# --- Latest per-inhabitant deaths against static country attributes ---------
death_df_static = death_norm_df[static_columns].copy().rename(
    columns={data_df_columns[-1]: 'deaths'}
)

# Arguments shared by all three scatter plots below.
_death_scatter_kwargs = dict(y="deaths", hover_data=['country'], log_x=True, log_y=True)

fig = px.scatter(death_df_static, x="Med. Age", **_death_scatter_kwargs)
fig.show()

# Bubble chart: marker size follows the population, colour the country.
fig = px.scatter(
    death_df_static,
    x="Urban Pop %",
    size_max=60,
    color='country',
    size='Population (2020)',
    **_death_scatter_kwargs,
)
fig.show()

fig = px.scatter(death_df_static, x="Density (P/Km²)", **_death_scatter_kwargs)
fig.show()